In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split  # moved to sklearn.model_selection in newer scikit-learn versions
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
In [2]:
csv_df = pd.DataFrame.from_csv("marijuana-street-price-clean.csv",header=0,index_col=False,sep=',',parse_dates=[-1])
In [3]:
#csv_df.head
In [4]:
csv_df_sort = csv_df.sort(columns=['State','date'])
In [5]:
# let us count the number of entries in each column
In [6]:
csv_df_sort.count()
Out[6]:
In [7]:
csv_df_sort.dtypes
Out[7]:
In [8]:
# we see that LowQ has fewer entries than the other columns; this is a common problem in time series analysis
# let's fill the NaN values with the last known value (forward fill) so that we can continue the time series analysis
# (a small toy illustration of forward fill follows below)
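In [ ]:
# A toy illustration of forward fill on made-up numbers (not from the dataset):
# each NaN is replaced by the last non-null value seen before it.
pd.Series([10.0, np.nan, np.nan, 12.0, np.nan]).fillna(method='ffill')
# -> 10.0, 10.0, 10.0, 12.0, 12.0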
In [9]:
csv_df_sort_fillna_ffil = csv_df_sort.fillna(method='ffill')
In [10]:
csv_df_sort_fillna_ffil.count()
Out[10]:
In [11]:
#now we have a clean data set to work with
In [12]:
global_mean_HighQ = csv_df_sort_fillna_ffil['HighQ'].mean()
In [13]:
global_mean_HighQ
Out[13]:
In [14]:
#now that we have the global mean, we can find how far each state's mean is from the global mean (a small sketch of this follows below)
In [15]:
print type(csv_df_sort_fillna_ffil.head())
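In [ ]:
# A minimal sketch of the idea noted above, reusing the frame and the global
# mean already defined (nothing new is assumed beyond the existing columns).
state_mean_HighQ = csv_df_sort_fillna_ffil.groupby('State')['HighQ'].mean()
deviation_from_global_mean = state_mean_HighQ - global_mean_HighQ
deviation_from_global_mean.head()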
In [16]:
#group_by_state.get_group(('Alabama'))
# to check values for a given group
# now let's use the agg function to generate the mean, median, standard deviation, variance and covariance for every state
In [17]:
basic_measures_state_wise_df = pd.DataFrame()
In [18]:
#let's get all the unique states in the data frame and construct the measures for each state
# straightforward method (kept for reference, not executed):
'''
unique_states = pd.unique(csv_df_sort_fillna_ffil['State'].ravel())
for state in unique_states:
    state_df = csv_df_sort_fillna_ffil.loc[csv_df_sort_fillna_ffil['State'] == state]
    highq_df = state_df.groupby(['State'])['HighQ'].agg([np.mean,np.median,np.std,np.var,np.cov])
    highq_df.head()
    lowq_df = state_df.groupby(['State'])['LowQ'].agg([np.mean,np.median,np.std,np.var,np.cov])
    highq_df = highq_df.rename(columns={'mean':'HighQ_Mean','median':'HighQ_Median','std':'HighQ_Std','var':'HighQ_Var','cov':'HighQ_Cov'})
    lowq_df = lowq_df.rename(columns={'mean':'LowQ_Mean','median':'LowQ_Median','std':'LowQ_Std','var':'LowQ_Var','cov':'LowQ_Cov'})
    highq_df['State'] = state
    lowq_df['State'] = state
    break
'''
Out[18]:
In [19]:
csv_df_sort_fillna_ffil.head()
Out[19]:
In [20]:
statistics_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['State'],as_index=False).aggregate\
({'HighQ': {'HighQ_Mean': np.mean,'HighQ_Median':np.median,'HighQ_Mode':stats.mode,'HighQ_Std':np.std,'HighQ_Var':np.var,'HighQ_Covar':np.cov},\
'LowQ':{'LowQ_Mean':np.mean,'LowQ_Median':np.median,'LowQ_Mode':stats.mode,'LowQ_Std':np.std,'LowQ_Var':np.var,'LowQ_Covar':np.cov}}))
print type(statistics_df)
statistics_df = statistics_df.rename(columns={'':'State'})
print type(statistics_df)
statistics_df.columns = statistics_df.columns.droplevel(0)
statistics_df.head()
Out[20]:
In [21]:
#let us now read demographics data into a dataframe
In [22]:
demographics_df = pd.DataFrame.from_csv("Demographics_State.csv",header=0,index_col=False,sep=',')
In [23]:
demographics_df.head()
Out[23]:
In [24]:
demographics_df.shape
Out[24]:
In [25]:
demographics_df.columns
Out[25]:
In [26]:
#let us now read population data into a dataframe
In [27]:
population_df = pd.DataFrame.from_csv("Population_State.csv",header=0,index_col=False,sep=",")
In [28]:
population_df.head()
Out[28]:
In [29]:
population_df.shape
Out[29]:
In [30]:
population_df.describe()
Out[30]:
In [31]:
population_df.columns
Out[31]:
In [32]:
#we are now merging the demographic and population data into one single data frame
#to do this, we are using pandas' merge method, which lets us join two data frames much like a SQL join
# we are using an inner join on the column region
In [33]:
population_demographic_merge_df = pd.merge(demographics_df,population_df,how='inner',on='region')
In [34]:
population_demographic_merge_df.head()
Out[34]:
In [35]:
population_demographic_merge_df.shape
Out[35]:
In [36]:
population_demographic_merge_df = population_demographic_merge_df.rename(columns={'region':'State'})
In [37]:
population_demographic_merge_df.head()
Out[37]:
In [38]:
statistics_population_demographic_merge_df = pd.merge(population_demographic_merge_df,statistics_df,how='inner',on='State')
In [39]:
statistics_population_demographic_merge_df.head()
Out[39]:
In [40]:
#the reason there are no entries here is that in the population/demographic data frame the state names are lower case
# let us now change the states to lower case in the statistics data frame too
In [41]:
statistics_df.head()
Out[41]:
In [42]:
statistics_df['State'] = statistics_df['State'].str.lower()
statistics_df.head()
Out[42]:
In [43]:
#now let's merge the data again; the reason we are merging all of this data is so that we can look at one state on multiple levels.
In [44]:
stats_pop_dem_merge_df = pd.merge(population_demographic_merge_df,statistics_df,how='inner',on='State')
In [45]:
stats_pop_dem_merge_df.head()
Out[45]:
In [46]:
stats_pop_dem_merge_df.shape
stats_pop_dem_merge_df.columns
Out[46]:
In [47]:
#so now we have data for 51 states with 20 feature columns.
#The data is ready
In [48]:
white_greater_black = stats_pop_dem_merge_df[stats_pop_dem_merge_df.percent_white > stats_pop_dem_merge_df.percent_black]
black_greater_white = stats_pop_dem_merge_df[stats_pop_dem_merge_df.percent_white < stats_pop_dem_merge_df.percent_black]
In [49]:
white_greater_black.shape,black_greater_white.shape
Out[49]:
In [50]:
#given that the white population is higher in almost every state, let's not do any comparison using the
#percentages of whites and blacks
In [51]:
#let us see if there is a correlation between per capita income and highQ_mean/lowQ_mean
correlation_btw_per_capita_highQ = stats.pearsonr(white_greater_black.per_capita_income,white_greater_black.HighQ_Mean)[0]
correlation_btw_per_capita_lowQ = stats.pearsonr(white_greater_black.per_capita_income,white_greater_black.LowQ_Mean)[0]
In [52]:
correlation_btw_per_capita_highQ,correlation_btw_per_capita_lowQ
Out[52]:
In [53]:
# using the population data, let us see whether total population correlates with price
population_correlation = stats.pearsonr(stats_pop_dem_merge_df.total_population,stats_pop_dem_merge_df.HighQ_Mean)[0]
population_correlation
# we see that it is inversely correlated to an extent, which implies that the greater the population, the lower the price
Out[53]:
In [54]:
# let us see if there is a big difference in the price of high quality
# weed between states selling it legally and those that don't
legal_states = ['colorado']
illegal_states = ['wyoming']
legal_states_mean = stats_pop_dem_merge_df[stats_pop_dem_merge_df['State'].isin(legal_states)]['HighQ_Mean']
illegal_states_mean = stats_pop_dem_merge_df[stats_pop_dem_merge_df['State'].isin(illegal_states)]['HighQ_Mean']
legal_states_mean = legal_states_mean.reset_index()
illegal_states_mean = illegal_states_mean.reset_index()
print legal_states_mean.head()
print illegal_states_mean.head()
# we see that the legal state sells HighQ marijuana at a lower price (a rough significance check follows below)
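In [ ]:
# A rough follow-up sketch (not part of the original comparison above): Welch's
# t-test on the daily HighQ series of the two example states, Colorado (legal)
# and Wyoming (not legal), to see whether the difference in means is significant.
colorado_prices = csv_df_sort_fillna_ffil[csv_df_sort_fillna_ffil['State'] == 'Colorado']['HighQ']
wyoming_prices = csv_df_sort_fillna_ffil[csv_df_sort_fillna_ffil['State'] == 'Wyoming']['HighQ']
stats.ttest_ind(colorado_prices, wyoming_prices, equal_var=False)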
In [55]:
# from this we understand that the price of low quality weed is somewhat more affected by per capita income than
# that of high quality weed
# we can use the same approach to compute correlations amongst other attributes too (a sketch follows below)
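In [ ]:
# A sketch of that idea: pairwise Pearson correlations across a few numeric
# columns of the merged frame (the column subset here is just illustrative).
numeric_cols = ['per_capita_income','total_population','percent_white','percent_black','HighQ_Mean','LowQ_Mean']
stats_pop_dem_merge_df[numeric_cols].corr()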
In [56]:
#before we go any further, let's try to find some interesting trends in weed prices for each state
csv_df_sort_fillna_ffil.head()
Out[56]:
In [57]:
country_mean_highQ = csv_df_sort_fillna_ffil.groupby(['date'],as_index=False).aggregate({'HighQ':np.mean})
In [58]:
country_mean_highQ.head()
Out[58]:
In [59]:
country_mean_highQ.min()
Out[59]:
In [60]:
#we just have dates; we need to generate the calendar week for each date
In [61]:
from datetime import datetime
# date format in %Y-%m-%d
def get_calendar_week(date):
    calendar_week = date.isocalendar()[1]
    return int(calendar_week)
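In [ ]:
# A quick sanity check of the helper (illustrative date, not from the dataset):
# 2014-01-01 falls in ISO calendar week 1.
get_calendar_week(datetime.strptime("2014-01-01", "%Y-%m-%d"))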
In [62]:
# let's add the calendar week to the data frame
csv_df_sort_fillna_ffil['Calendar_Week'] = csv_df_sort_fillna_ffil['date'].apply(get_calendar_week)
In [63]:
#let's check that it looks fine
csv_df_sort_fillna_ffil.head()
Out[63]:
In [64]:
#now let us group at the week level and see during which week the price was lowest across the country
country_week_highQ_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['Calendar_Week'],as_index=False).aggregate({'HighQ':{'HighQ_Mean':np.mean}}))
country_week_highQ_df.columns = country_week_highQ_df.columns.droplevel(0)
country_week_highQ_df = country_week_highQ_df.rename(columns={'':'Calendar_Week'})
country_week_highQ_df.head()
print country_week_highQ_df.min(),country_week_highQ_df.max()
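In [ ]:
# matplotlib was imported at the top but not used so far; a quick illustrative
# sketch of how the weekly country-wide mean could be visualised.
plt.figure(figsize=(10, 4))
plt.plot(country_week_highQ_df['Calendar_Week'], country_week_highQ_df['HighQ_Mean'])
plt.xlabel('Calendar week')
plt.ylabel('Mean HighQ price')
plt.title('Country-wide mean HighQ price by calendar week')
plt.show()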
In [65]:
#now let us check on which day it was the lowest across the country
country_day_highQ_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['date'],as_index=False).aggregate({'HighQ':{'HighQ_Mean':np.mean}}))
country_day_highQ_df.columns = country_day_highQ_df.columns.droplevel(0)
country_day_highQ_df = country_day_highQ_df.rename(columns={'':'date'})
country_day_highQ_df.head()
print country_day_highQ_df.min(),country_day_highQ_df.max()
In [66]:
# now let us use HighQN (the number of HighQ price reports) as a rough proxy for when people buy the most across the country
country_week_highQN_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['Calendar_Week'],as_index=False).aggregate({'HighQN':{'HighQN_Mean':np.mean}}))
country_week_highQN_df.columns = country_week_highQN_df.columns.droplevel(0)
country_week_highQN_df = country_week_highQN_df.rename(columns={'':'Calendar_Week'})
country_week_highQN_df.head()
print country_week_highQN_df.min(),country_week_highQN_df.max()
In [67]:
# The above set of statements works across the country based on date and calendar week
# Now let us look at what happens within each month by generating the calendar week of the month, and then see during
# which parts of the month the price is at its highest/lowest
In [68]:
def get_calendar_week_month(date):
    calendar_week_year = get_calendar_week(date)
    start_of_month = datetime(date.year, date.month, 1)
    calendar_week_start_of_month = start_of_month.isocalendar()[1]
    if calendar_week_year >= calendar_week_start_of_month:
        return calendar_week_year - calendar_week_start_of_month + 1
    else:
        # this happens only for late December, when the date's ISO week wraps
        # around to week 1 of the next year
        return ((date.day + start_of_month.weekday()) // 7) + 1
In [69]:
# a quick check that the calendar-week-of-month function works
get_calendar_week_month(datetime.strptime("2013-12-30","%Y-%m-%d"))
Out[69]:
In [70]:
csv_df_sort_fillna_ffil['Calendar_Week_Month'] = csv_df_sort_fillna_ffil['date'].apply(get_calendar_week_month)
In [71]:
csv_df_sort_fillna_ffil.head()
Out[71]:
In [72]:
#now let us group at the week-of-month level and see during which week of the month the price was lowest
country_month_week_highQ_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['Calendar_Week_Month'],as_index=False).aggregate({'HighQ':{'HighQ_Mean':np.mean}}))
country_month_week_highQ_df.columns = country_month_week_highQ_df.columns.droplevel(0)
country_month_week_highQ_df = country_month_week_highQ_df.rename(columns={'':'Calendar_Week_Month'})
country_month_week_highQ_df.head()
print country_month_week_highQ_df.min()
print country_month_week_highQ_df.max()
In [73]:
csv_df_sort_fillna_ffil.columns
Out[73]:
In [74]:
#now that we have some insights into the data, let us take the next step and use a linear regression model to predict the
# prices of marijuana for the state of Alabama
columns_required = ['State','HighQ','date']
alabama_state = csv_df_sort_fillna_ffil[csv_df_sort_fillna_ffil['State'] == "Alabama"]
alabama_state_price_date = alabama_state.ix[:,columns_required]
alabama_state_price_date.dtypes
Out[74]:
In [75]:
train,test = train_test_split(alabama_state_price_date,test_size = 0.2)
In [76]:
def changeDateToOrdinal(date):
    return date.toordinal()
In [77]:
train_pd = pd.DataFrame(train,columns=['State','Price','Date'])
train_pd['State'] = 1
train_pd['Price'] = train_pd['Price'].astype(float)
train_pd['Date_ord'] = train_pd['Date'].apply(changeDateToOrdinal)
train_pd.head()
Out[77]:
In [78]:
train_pd.head()
Out[78]:
In [79]:
test_pd = pd.DataFrame(test,columns=['State','Price','Date'])
test_pd['State'] = 1
test_pd['Price'] = test_pd['Price'].astype(float)
test_pd['Date_ord'] = test_pd['Date'].apply(changeDateToOrdinal)
In [80]:
print train_pd.head()
train_pd.dtypes
print test_pd.head()
test_pd.dtypes
Out[80]:
In [81]:
x_train = train_pd.ix[:,['State','Date_ord']]
y_train = train_pd.ix[:,['Price']]
print x_train.dtypes,y_train.dtypes
x_test = test_pd.ix[:,['State','Date_ord']]
y_test = test_pd.ix[:,['Price']]
x_train.head()
Out[81]:
In [82]:
y_train.head()
y_train.dtypes
Out[82]:
In [83]:
print x_train.head()
x_train.dtypes
Out[83]:
In [84]:
ols = linear_model.LinearRegression(normalize=True, fit_intercept=True, n_jobs=-1)
ols.fit(x_train, y_train)
print ols.coef_
ols_predict = ols.predict(x_test)
#print ols_predict
# attach the model's predicted prices to the test data frame
test_pd['Predicted_Price'] = ols_predict
In [85]:
test_pd.head()
Out[85]:
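In [ ]:
# The fit is never quantified above; a minimal sketch of scoring the model on
# the held-out test set (R^2 from the estimator, mean absolute error by hand).
r2_test = ols.score(x_test, y_test)
mae_test = np.mean(np.abs(test_pd['Predicted_Price'] - test_pd['Price']))
r2_test, mae_test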